home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Sprite 1984 - 1993
/
Sprite 1984 - 1993.iso
/
src
/
machserver
/
1.098
/
recov
/
recovery.c
< prev
next >
Wrap
C/C++ Source or Header
|
1990-10-11
|
57KB
|
2,012 lines
/*
* recovery.c --
*
* The routines here maintain up/down state about other hosts.
* Other modules register as clients of the recovery module,
* and can then ask to be called back when some other host crashes
* or reboots. Modules always get called back when someone crashes,
* and then they have the option of being called back when the
* host reboots. Regular message traffic plus explicit pinging
* are used to track the state of the other hosts. Pinging is
* only done if some module is explicitly interested in a host.
*
* Recov_HostAlive and Recov_HostDead are used by RPC to tell us when
* a messages have arrived, or if transactions have timed out.
* Recov_IsHostDown is used to query the state of another host,
* Recov_RebootCallBack is used to get a callback upon a reboot, and
* Recov_WaitForHost is used to block a process until a host reboots.
* (Recov_WaitForHost isn't used much. Instead, modules rely on the
* recovery callbacks to indicate that a host is back to life, and
* they block processes in their own way.)
*
* Note: A synchronization hook is provided by Recov_HostAlive; its
* caller can be blocked if crash recovery actions are in progress.
*
* Copyright 1987 Regents of the University of California
* All rights reserved.
*/
#ifndef lint
static char rcsid[] = "$Header: /sprite/src/kernel/recov/RCS/recovery.c,v 9.19 90/10/11 14:10:41 kupfer Exp $ SPRITE (Berkeley)";
#endif /* not lint */
#include <sprite.h>
#include <recov.h>
#include <sync.h>
#include <net.h>
#include <rpc.h>
#include <hash.h>
#include <stdlib.h>
#include <trace.h>
#include <fsutil.h>
#include <bstring.h>
#include <stdio.h>
/*
* Other kernel modules arrange call-backs when a host crashes or reboots.
* The following list structure is used to keep these. The calling
* sequence of the callbacks is as follows:
* (*proc)(spriteID, clientData)
* Use Recov_CrashRegister and Recov_RebootRegister to set up the call backs.
*/
typedef struct {
List_Links links;
void (*proc)();
int refCount;
ClientData data;
} NotifyElement;
/*
* There is a single list of crash call backs, it isn't per machine
* like the reboot callbacks.
*/
static List_Links crashCallBackList;
/*
* recov_CrashDelay is the grace period given when another host
* is apparently down. Reboots are still detected so that
* the crash callbacks will get called to clean up.
*/
unsigned int recov_CrashDelay;
/*
* Statistics about the recovery module.
*/
Recov_Stats recov_Stats;
/*
* For per-client statistics about recovery on the server.
* This amounts to a per-host list, in array form.
* Each host has numTries elements in the array. The spriteID and numTries
* fields are only initialized in the first element.
*/
typedef struct RecovPerHostInfo {
int spriteID; /* Sprite ID of client. */
Time start; /* First recovery attempt. */
Time finished; /* First recovery attempt finished. */
int numTries; /* Number of recovery attempts. */
int numHandles; /* Number of reopens requested. */
int numSuccessful; /* Handles successfully recovered. */
} RecovPerHostInfo;
/*
* The state of other hosts is kept in a hash table keyed on SpriteID.
* This state is maintained by Recov_HostAlive and Recov_HostDead, which are
* called in turn after packet reception or RPC timeout, respectively.
* Recov_HostDead is also called by the Rpc_Daemon if it can't get an
* explicit acknowledgment from a client.
*/
static Hash_Table recovHashTableStruct;
static Hash_Table *recovHashTable = &recovHashTableStruct;
typedef struct RecovStampList {
List_Links timeStampList;
Timer_Ticks start;
Timer_Ticks finished;
int numHandles; /* Handles since last time. */
int numSuccessful; /* Successful last time. */
} RecovStampList;
typedef struct RecovHostState {
int state; /* flags defined below */
int clientState; /* flags defined in recov.h */
int spriteID; /* Sprite Host ID */
unsigned int bootID; /* Boot timestamp from RPC header */
Time time; /* Time of last message */
Sync_Condition alive; /* Notified when host comes up */
Sync_Condition recovery; /* Notified when recovery is complete */
List_Links rebootList; /* List of callbacks for when this
* host reboots. */
int numFailures; /* Times a failure occurs during the
* reboot callbacks. Such a failure
* triggers a retry of the reboot
* callbacks. */
/*
* The following fields are used in the tracing of the recovery module.
*/
Timer_Ticks start; /* Time that recovery is started. */
Timer_Ticks finished; /* Time recovery attempt finishes. */
int numTries; /* Number of times recov attempted. */
int numHandles; /* Handles requested. */
int numSuccessful; /* Successful handles. */
int currentHandles; /* Temporary info. */
int currentSuccessful;
List_Links timeStampList; /* List of time stamps for recovery. */
} RecovHostState;
#define RECOV_INIT_HOST(hostPtr, zspriteID, zstate, zbootID) \
hostPtr = (RecovHostState *) malloc(sizeof (RecovHostState)); \
(void)bzero((Address)hostPtr, sizeof(RecovHostState)); \
List_Init(&(hostPtr)->rebootList); \
List_Init(&(hostPtr)->timeStampList);\
(hostPtr)->spriteID = zspriteID; \
(hostPtr)->state = zstate; \
(hostPtr)->bootID = zbootID; \
(hostPtr)->numFailures = 0;
/*
* Access to the hash table is monitored.
*/
static Sync_Lock recovLock;
#define LOCKPTR (&recovLock)
/*
* recov_PrintLevel defines how noisey we are about other hosts.
* Values for the print level should be defined in increasing order.
*/
int recov_PrintLevel = RECOV_PRINT_REBOOT;
#define RecovHostPrint(level, spriteID, message) \
if (recov_PrintLevel >= level) { \
Sys_HostPrint(spriteID, message); \
}
Trace_Header recovTraceHdr;
Trace_Header *recovTraceHdrPtr = &recovTraceHdr;
int recovTraceLength = 50;
Boolean recovTracing = TRUE;
/*
* Forward declarations.
*/
static void CrashCallBacks _ARGS_((ClientData data, Proc_CallInfo *callInfoPtr));
#ifdef dying_state
static void DelayedCrashCallBacks _ARGS_((ClientData data, Proc_CallInfo *callInfoPtr));
static void MarkHostDead _ARGS_((int spriteID));
#endif /* dying_state */
static void CallBacksDone _ARGS_((int spriteID));
static void MarkRecoveryComplete _ARGS_((int spriteID));
static void GetRebootList _ARGS_((List_Links *notifyListHdr, int spriteID));
static char *GetState _ARGS_((int state));
static void PrintExtraState _ARGS_((RecovHostState *hostPtr));
/*
*----------------------------------------------------------------------
*
* Recov_Init --
*
* Set up the data structures used by the recovery module.
*
* Results:
* None.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
void
Recov_Init()
{
Sync_LockInitDynamic(&recovLock, "Recov:recovLock");
Hash_Init(recovHashTable, 8, HASH_ONE_WORD_KEYS);
List_Init(&crashCallBackList);
Trace_Init(recovTraceHdrPtr, recovTraceLength,
sizeof(RecovTraceRecord), 0);
recov_CrashDelay = (unsigned int)(timer_IntOneMinute);
RecovPingInit();
return;
}
/*
*----------------------------------------------------------------------
*
* Recov_CrashRegister --
*
* This procedure is used to register a crash callback procedure.
* This is typically done once at boot time by each module that
* is interested in learning about the failure of other hosts.
* When other hosts are (apparently) down the recovery module
* calls back to other modules that have registered via this procedure.
* This allows those other modules to clean up any state associated
* with the crashed host.
*
* Results:
* None.
*
* Side effects:
* Callback entry added to the crash call-back list.
*
*----------------------------------------------------------------------
*/
void
Recov_CrashRegister(crashCallBackProc, crashData)
void (*crashCallBackProc)();
ClientData crashData;
{
register NotifyElement *notifyPtr;
notifyPtr = (NotifyElement *) malloc(sizeof (NotifyElement));
notifyPtr->proc = crashCallBackProc;
notifyPtr->data = crashData;
List_InitElement((List_Links *) notifyPtr);
List_Insert((List_Links *) notifyPtr, LIST_ATREAR(&crashCallBackList));
return;
}
/*
*----------------------------------------------------------------------
*
* Recov_RebootRegister --
*
* Schedule a callback for when a particular host reboots.
* To make sure we detect a crash, the recovery module has to
* periodically check on the state of the target host.
*
* Results:
* None.
*
* Side effects:
* This initiate a background callback to check-up on the host's state.
*
*----------------------------------------------------------------------
*/
ENTRY void
Recov_RebootRegister(spriteID, rebootCallBackProc, rebootData)
int spriteID;
void (*rebootCallBackProc)();
ClientData rebootData;
{
Hash_Entry *hashPtr;
RecovHostState *hostPtr;
register NotifyElement *notifyPtr;
Boolean found = FALSE;
LOCK_MONITOR;
if (spriteID <= 0 || spriteID == rpc_SpriteID) {
panic("Recov_RebootRegister, bad hostID %d\n", spriteID);
} else {
hashPtr = Hash_Find(recovHashTable, (Address)spriteID);
if (hashPtr->value == (Address)NIL) {
RECOV_INIT_HOST(hostPtr, spriteID, RECOV_STATE_UNKNOWN, 0);
hashPtr->value = (Address)hostPtr;
} else {
hostPtr = (RecovHostState *)hashPtr->value;
}
/*
* Save the callback while avoiding duplications.
*/
LIST_FORALL(&hostPtr->rebootList, (List_Links *)notifyPtr) {
if (notifyPtr->proc == rebootCallBackProc &&
notifyPtr->data == rebootData) {
found = TRUE;
break;
}
}
if (!found) {
notifyPtr = (NotifyElement *) malloc(sizeof (NotifyElement));
notifyPtr->proc = rebootCallBackProc;
notifyPtr->data = rebootData;
notifyPtr->refCount = 1;
List_InitElement((List_Links *)notifyPtr);
List_Insert((List_Links *)notifyPtr,
LIST_ATFRONT(&hostPtr->rebootList));
} else {
notifyPtr->refCount++;
}
/*
* Mark the host as being interesting, and add it to the ping
* list if necessary.
*/
hostPtr->state |= RECOV_PINGING_HOST;
RecovAddHostToPing(spriteID);
}
UNLOCK_MONITOR;
return;
}
/*
*----------------------------------------------------------------------
*
* Recov_RebootUnRegister --
*
* Remove a callback for when a particular host reboots. This is
* used after we are no longer interested in a host rebooting.
*
* Results:
* None.
*
* Side effects:
* Nukes the reboot procedure. If all interested parties remove their
* reboot callbacks then the periodic check of the other host is
* stopped.
*
*----------------------------------------------------------------------
*/
ENTRY void
Recov_RebootUnRegister(spriteID, rebootCallBackProc, rebootData)
int spriteID;
void (*rebootCallBackProc)();
ClientData rebootData;
{
Hash_Entry *hashPtr;
RecovHostState *hostPtr;
register NotifyElement *notifyPtr;
Boolean found = FALSE;
LOCK_MONITOR;
if (spriteID <= 0 || spriteID == rpc_SpriteID) {
panic("Recov_RebootUnRegister, bad hostID %d\n", spriteID);
} else {
hashPtr = Hash_Find(recovHashTable, (Address)spriteID);
if (hashPtr->value == (Address)NIL) {
RECOV_INIT_HOST(hostPtr, spriteID, RECOV_STATE_UNKNOWN, 0);
hashPtr->value = (Address)hostPtr;
} else {
hostPtr = (RecovHostState *)hashPtr->value;
}
/*
* Look for the matching callback.
*/
LIST_FORALL(&hostPtr->rebootList, (List_Links *)notifyPtr) {
if (notifyPtr->proc == rebootCallBackProc &&
notifyPtr->data == rebootData) {
found = TRUE;
break;
}
}
if (found) {
notifyPtr->refCount--;
if (notifyPtr->refCount <= 0) {
int num;
/*
* Mousetrap for debugging recovery reference count problem.
*/
if (notifyPtr->proc == (void((*)())) Fsutil_Reopen) {
if (recov_PrintLevel >= RECOV_PRINT_CRASH) {
printf(
"Recov: deleting Fsutil_Reopen for server %d ref count %d\n",
spriteID, notifyPtr->refCount);
}
/*
* We want to panic if we still have handles for
* this server.
*/
num = Fsutil_TestForHandles(spriteID);
/*
* This routine is called before the handle is removed,
* so we must take into account the fact that it still
* exists in the handle table.
*/
if (num > 1) {
printf("%d file and device handles remain\n", num);
panic("Shouldn't have deleted it - handles remain!\n");
}
}
List_Remove((List_Links *)notifyPtr);
free((Address)notifyPtr);
}
}
}
UNLOCK_MONITOR;
return;
}
/*
*----------------------------------------------------------------------
*
* Recov_HostAlive --
*
* Mark the host as being alive. This is called when we've received
* a message from the host. It uses state from the host table and
* the bootID parameter to detect reboots. If a reboot is detected,
* but we thought the host was up, then the Crash call-backs are invoked.
* In any case, a reboot invokes the Reboot call-backs, if any.
*
* This procedure is called from client RPC upon successful completion
* of an RPC, and by server RPC upon reciept of a client request.
* These two cases are identified by the 'asyncRecovery' parameter.
* Servers want synchronous recovery so they don't service anything
* until state associated with that client has been cleaned up via
* the Crash call-backs. So Recov_HostAlive blocks (if !asyncRecovery)
* until the crash call-backs are complete. Clients don't have the
* same worries so they let the crash call-backs complete in the
* background (asyncRecovery is TRUE).
*
* Results:
* None.
*
* Side effects:
* Updates the boot timestamp of the other host. Procedures installed
* with Recov_CrashRegister are called when the bootID changes. A
* timestamp of when this message was received is obtained from the
* "cheap" clock so we can tell later if there has been recent message
* traffic.
*
*----------------------------------------------------------------------
*/
ENTRY void
Recov_HostAlive(spriteID, bootID, asyncRecovery, rpcNotActive)
int spriteID; /* Host ID of the message sender */
unsigned int bootID; /* Boot time stamp from message header */
Boolean asyncRecovery; /* TRUE means do recovery call-backs in
* the background. FALSE causes the process
* to wait until crash recovery is complete. */
Boolean rpcNotActive; /* This is a flag propogated from the rpc
* packet header. If set it means the RPC
* system on the remote host isn't fully
* turned on. Reboot recovery is delayed
* until this changes. */
{
register Hash_Entry *hashPtr;
register RecovHostState *hostPtr;
LOCK_MONITOR;
if (spriteID == NET_BROADCAST_HOSTID || bootID == 0 || sys_ShuttingDown) {
/*
* Don't track the broadcast address. Also ignore zero valued
* bootIDs. These come from hosts at early boot time, or
* in certain error conditions like trying to send too much
* data in a single RPC. Also don't bother to check things
* where we are shutting down the system because we don't want
* RPCs for the cache data to get blocked.
*/
UNLOCK_MONITOR;
return;
}
recov_Stats.packets++;
hashPtr = Hash_Find(recovHashTable, (Address)spriteID);
if (hashPtr->value == (Address)NIL) {
/*
* Initialize the host's state. This is the first time we've talked
* to it since we've been up, so take no action.
*/
RECOV_INIT_HOST(hostPtr, spriteID, RECOV_HOST_ALIVE, bootID);
hashPtr->value = (Address)hostPtr;
RecovHostPrint(RECOV_PRINT_IF_UP, spriteID, "is up\n");
RECOV_TRACE(spriteID, RECOV_HOST_ALIVE, RECOV_CUZ_INIT);
} else {
hostPtr = (RecovHostState *)hashPtr->value;
}
/*
* Have to read the clock in order to suppress repeated pings,
* see Recov_GetHostState and Recov_IsHostDown.
*/
Timer_GetTimeOfDay(&hostPtr->time, (int *)NIL, (Boolean *)NIL);
/*
* Check for a rebooted peer by comparing boot time stamps.
*/
if (hostPtr->bootID != bootID) {
if (hostPtr->bootID != 0) {
RecovHostPrint(RECOV_PRINT_REBOOT, spriteID, "rebooted\n");
} else {
/*
* We initialized state before talking to the host the first time.
* The state is 'unknown' so we won't do crash call-backs.
*/
}
hostPtr->bootID = bootID;
RECOV_TRACE(spriteID, hostPtr->state, RECOV_CUZ_REBOOT);
if (hostPtr->state &
(RECOV_HOST_ALIVE|RECOV_HOST_DYING|RECOV_HOST_BOOTING)) {
RecovHostPrint(RECOV_PRINT_ALL, spriteID,
"Undetected crash occurred.\n");
/*
* A crash occured un-detected. We do the crash call-backs
* first, and block server processes in the meantime.
* RECOV_CRASH_CALLBACKS flag is cleared by CrashCallBacks.
*/
hostPtr->state &=
~(RECOV_HOST_ALIVE|RECOV_HOST_DYING|RECOV_HOST_DEAD);
hostPtr->state |= RECOV_HOST_BOOTING;
RECOV_TRACE(spriteID, hostPtr->state, RECOV_CUZ_CRASH_UNDETECTED);
if ((hostPtr->state & RECOV_CRASH_CALLBACKS) == 0) {
hostPtr->state |= RECOV_CRASH_CALLBACKS;
RECOV_TRACE(spriteID, hostPtr->state, RECOV_CUZ_CRASH_UNDETECTED);
Proc_CallFunc(CrashCallBacks, (ClientData)spriteID, 0);
}
}
} else if ( ! (hostPtr->state &
(RECOV_CRASH_CALLBACKS|RECOV_WANT_RECOVERY)) &&
(hostPtr->state & RECOV_HOST_ALIVE)) {
/*
* Fast path. We already think the other host is up, it didn't
* reboot, we don't want recovery, and there are no pending
* crash call-backs to synchronize with.
*/
goto exit;
}
/*
* Block servers until crash recovery actions complete.
* This prevents servicing requests from clients until after the
* recovery actions complete.
*/
if (! asyncRecovery) {
RecovHostPrint(RECOV_PRINT_ALL, spriteID, "Async recovery false.\n");
while (hostPtr->state & RECOV_CRASH_CALLBACKS) {
(void)Sync_Wait(&hostPtr->recovery, FALSE);
if (sys_ShuttingDown) {
UNLOCK_MONITOR;
Proc_Exit(1);
}
}
}
/*
* Now that we've taken care of crash recovery, we see if the host
* is newly up. If so, invoke any reboot call-backs and notify
* waiting processes. This means clientA (us) may start
* re-opening files from serverB (the other guy) at the same time
* as clientA (us) is closing files that serverB had had open.
* ie. both the crash and reboot call backs may proceed in parallel.
*/
switch(hostPtr->state &
(RECOV_HOST_ALIVE|RECOV_HOST_BOOTING|RECOV_HOST_DEAD|RECOV_HOST_DYING)) {
case RECOV_STATE_UNKNOWN: /* This is zero, no bits set */
/*
* We have uninitialized state for the host, mark it alive.
*/
RecovHostPrint(RECOV_PRINT_IF_UP, spriteID, "is up\n");
if (rpcNotActive) {
hostPtr->state |= RECOV_HOST_BOOTING;
} else {
hostPtr->state |= RECOV_HOST_ALIVE;
}
break;
case RECOV_HOST_ALIVE:
/*
* Host already alive. We may still want recovery at this
* point. See CallBacksDone.
*/
RecovHostPrint(RECOV_PRINT_ALL, spriteID, "Already up.\n");
break;
case RECOV_HOST_BOOTING:
/*
* See if a booting host is ready yet.
*/
RecovHostPrint(RECOV_PRINT_ALL, spriteID, "Booting, set recov.\n");
if (! rpcNotActive) {
hostPtr->state &= ~RECOV_HOST_BOOTING;
hostPtr->state |= RECOV_HOST_ALIVE|RECOV_WANT_RECOVERY;
RecovHostPrint(RECOV_PRINT_ALL, spriteID,
"Booting, set alive, recov.\n");
}
break;
case RECOV_HOST_DYING:
case RECOV_HOST_DEAD:
/*
* See if the host is newly booting or back from a net partition.
*/
if (rpcNotActive) {
hostPtr->state |= RECOV_HOST_BOOTING;
RecovHostPrint(RECOV_PRINT_ALL, spriteID,
"Dead or dying, set booting.\n");
} else {
hostPtr->state |= (RECOV_HOST_ALIVE|RECOV_WANT_RECOVERY);
RecovHostPrint(RECOV_PRINT_ALL, spriteID,
"Dead, dying, set want recov.\n");
}
hostPtr->state &= ~(RECOV_HOST_DEAD|RECOV_HOST_DYING);
break;
default:
printf("Unexpected recovery state <%x> for ", hostPtr->state);
Sys_HostPrint(spriteID, "\n");
break;
}
/*
* After a host comes up enough to support RPC service, we
* initiate reboot recovery if needed.
*/
if ((hostPtr->state & RECOV_WANT_RECOVERY) &&
(hostPtr->state & RECOV_HOST_ALIVE) &&
(hostPtr->state & RECOV_REBOOT_CALLBACKS) == 0) {
hostPtr->state &= ~RECOV_WANT_RECOVERY;
hostPtr->state |= RECOV_REBOOT_CALLBACKS;
RecovHostPrint(RECOV_PRINT_ALL, spriteID,
"Want recov, etc, callbacks.\n");
Proc_CallFunc(RecovRebootCallBacks, (ClientData)spriteID, 0);
}
exit:
UNLOCK_MONITOR;
return;
}
/*
*----------------------------------------------------------------------
*
* Recov_HostDead --
*
* Change the host's state to "dead". This is called from client RPC
* when an RPC timed out with no response. It is also called by the
* Rpc_Daemon when it can't recontact a client to get an explicit
* acknowledgment.
*
* Results:
* None.
*
* Side effects:
* If the host was previously thought up, this sets the state in
* the host state table to dead and invokes the crash callbacks.
*
*----------------------------------------------------------------------
*/
ENTRY void
Recov_HostDead(spriteID)
int spriteID;
{
register Hash_Entry *hashPtr;
register RecovHostState *hostPtr;
LOCK_MONITOR;
if (spriteID == NET_BROADCAST_HOSTID || rpc_NoTimeouts) {
/*
* If rpcNoTimeouts is set the Rpc_Daemon may still call us if
* it can't get an acknowledgment from a host to close down
* a connection. We ignore this so that we don't take action
* against the offending host (who is probably in the debugger)
* (Hmm, it doesn't look like Rpc_Daemon calls this procedure.)
*/
UNLOCK_MONITOR;
return;
}
recov_Stats.timeouts++;
hashPtr = Hash_Find(recovHashTable, (Address)spriteID);
if (hashPtr->value == (Address)NIL) {
RECOV_INIT_HOST(hostPtr, spriteID, RECOV_HOST_DEAD, 0);
hashPtr->value = (Address)hostPtr;
} else {
hostPtr = (RecovHostState *)hashPtr->value;
}
switch(hostPtr->state &
(RECOV_HOST_ALIVE|RECOV_HOST_BOOTING|RECOV_HOST_DEAD)) {
case RECOV_HOST_DEAD:
case RECOV_HOST_DYING:
/*
* Host already dead or dying.
*/
break;
case RECOV_STATE_UNKNOWN:
case RECOV_HOST_BOOTING:
case RECOV_HOST_ALIVE:
hostPtr->state &=
~(RECOV_HOST_ALIVE|RECOV_HOST_BOOTING);
/*
* Special handling if we abort during the recovery protocol.
* In this case it is possible for the other host to go from
* alive to dead and back to alive before the recovery protocol
* finally terminates. If that happens we could loose a reboot
* event and fail to initiate recovery again. We mark the
* host specially so the reboot callbacks are retried.
*/
if (hostPtr->state & RECOV_REBOOT_CALLBACKS) {
hostPtr->state |= RECOV_FAILURE;
}
/*
* After an RPC timeout (which is already logged by RPC to syslog)
* make the crash call backs. These are made after a delay
* if dying_state is defined. This helps smooth over temporary
* communication failures.
*
*/
#ifdef dying_state
hostPtr->state |= RECOV_HOST_DYING;
Proc_CallFunc(DelayedCrashCallBacks, (ClientData)spriteID,
recov_CrashDelay);
#else
hostPtr->state |= RECOV_HOST_DEAD|RECOV_CRASH_CALLBACKS;
RecovHostPrint(RECOV_PRINT_CRASH, spriteID,
"crash call-backs made\n");
RECOV_TRACE(spriteID, hostPtr->state, RECOV_CUZ_CRASH);
Proc_CallFunc(CrashCallBacks, (ClientData)spriteID, 0);
#endif
break;
}
UNLOCK_MONITOR;
return;
}
/*
*----------------------------------------------------------------------
*
* Recov_IsHostDown --
*
* This decides if the specified host is down. If the host is known
* to be down this routine returns FAILURE. SUCCESS is returned if
* the host is alive, and RPC_SERVICE_DISABLED is returned if the
* host is in its boot sequence and can't service RPC's yet. If there
* hasn't been recent (within the last 10 seconds) message traffic
* this this pings the host to find out for sure its state.
*
* Results:
* SUCCESS if the host is up, FAILURE if it doesn't respond to
* pings or is known to be down, and RPC_SERVICE_DISABLED if
* the host says so.
*
* Side effects:
* May do a ping.
*
*----------------------------------------------------------------------
*/
ReturnStatus
Recov_IsHostDown(spriteID)
int spriteID;
{
register ReturnStatus status = SUCCESS;
if (spriteID == NET_BROADCAST_HOSTID) {
printf("Warning: Recov_IsHostDown, got broadcast address\n");
return(SUCCESS);
}
switch (Recov_GetHostState(spriteID)) {
case RECOV_STATE_UNKNOWN:
RECOV_TRACE(spriteID, RECOV_STATE_UNKNOWN, RECOV_CUZ_PING_ASK);
recov_Stats.pings++;
status = Rpc_Ping(spriteID);
break;
case RECOV_HOST_BOOTING:
case RECOV_HOST_ALIVE:
case RECOV_HOST_DYING: /* fake it to allow for the grace period */
recov_Stats.pingsSuppressed++;
status = SUCCESS;
break;
case RECOV_HOST_DEAD:
status = FAILURE;
break;
}
return(status);
}
/*
*----------------------------------------------------------------------
*
* Recov_HostTrace --
*
* Add an entry to the recovery trace.
*
* Results:
* None.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
ENTRY void
Recov_HostTrace(spriteID, event)
int spriteID;
int event;
{
/*
* No monitor lock needed here, since Trace_Insert does its own
* synchronization.
*/
RECOV_TRACE(spriteID, RECOV_STATE_UNKNOWN, event);
}
/*
*----------------------------------------------------------------------
*
* Recov_GetClientState --
*
* Return the client state associated with a host. The recovery host
* table is a convenient object keyed on spriteID. Other modules can
* set their own state in the table (beyond the simple up/down state
* mainted by the rest of this module), and retrieve it with this call.
*
* Results:
* A copy of the clientState field. 0 is returned if there is no
* host table entry.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
ENTRY int
Recov_GetClientState(spriteID)
int spriteID;
{
Hash_Entry *hashPtr;
RecovHostState *hostPtr;
int stateBits = 0;
LOCK_MONITOR;
hashPtr = Hash_LookOnly(recovHashTable, (Address)spriteID);
if (hashPtr != (Hash_Entry *)NIL) {
hostPtr = (RecovHostState *)hashPtr->value;
if (hostPtr != (RecovHostState *)NIL) {
stateBits = hostPtr->clientState;
}
}
UNLOCK_MONITOR;
return(stateBits);
}
/*
*----------------------------------------------------------------------
*
* Recov_SetClientState --
*
* Set a client state bit. This or's the parameter into the
* client state word. The previous value of the client state
* word is returned so this procedure can be used like test-and-set.
*
* Results:
* None.
*
* Side effects:
* Sets bits in the clientState field of the host state. This will add
* an entry to the host table if one doesn't alreay exist. Its RPC
* up/down state is set to "unknown" in this case.
*
*----------------------------------------------------------------------
*/
ENTRY int
Recov_SetClientState(spriteID, stateBits)
int spriteID;
int stateBits;
{
Hash_Entry *hashPtr;
RecovHostState *hostPtr;
register oldState;
RecovStampList *stampPtr;
LOCK_MONITOR;
hashPtr = Hash_Find(recovHashTable, (Address)spriteID);
hostPtr = (RecovHostState *)hashPtr->value;
if (hostPtr == (RecovHostState *)NIL) {
RECOV_INIT_HOST(hostPtr, spriteID, RECOV_STATE_UNKNOWN, 0);
hashPtr->value = (Address)hostPtr;
}
if ((stateBits & CLT_RECOV_IN_PROGRESS) != 0) {
if (hostPtr->numTries == 0) {
/* First recovery attempt */
if ((hostPtr->clientState & CLT_RECOV_IN_PROGRESS) != 0) {
printf("No recovery attempt yet, but marked as in progress.");
}
Timer_GetCurrentTicks(&hostPtr->start);
} else {
/* Add a time-stamp to the recovery list. */
stampPtr = (RecovStampList *) malloc(sizeof (RecovStampList));
Timer_GetCurrentTicks(&stampPtr->start);
List_InitElement((List_Links *) stampPtr);
List_Insert((List_Links *) stampPtr,
LIST_ATREAR(&hostPtr->timeStampList));
/*
* Clear handle count for this round.
*/
hostPtr->currentHandles = 0;
hostPtr->currentSuccessful = 0;
}
hostPtr->numTries++;
}
oldState = hostPtr->clientState;
hostPtr->clientState |= stateBits;
UNLOCK_MONITOR;
return(oldState);
}
/*
*----------------------------------------------------------------------
*
* Recov_ClearClientState --
*
* Clear client state bits.
*
* Results:
* None.
*
* Side effects:
* Clears bits in the clientState field of the host state. This does
* nothing if the state doesn't exist.
*
*----------------------------------------------------------------------
*/
ENTRY void
Recov_ClearClientState(spriteID, stateBits)
int spriteID;
int stateBits;
{
register Hash_Entry *hashPtr;
register RecovHostState *hostPtr = (RecovHostState *) NIL;
RecovStampList *stampPtr;
LOCK_MONITOR;
hashPtr = Hash_LookOnly(recovHashTable, (Address)spriteID);
if (hashPtr != (Hash_Entry *)NIL) {
hostPtr = (RecovHostState *)hashPtr->value;
if (hostPtr != (RecovHostState *)NIL) {
hostPtr->clientState &= ~stateBits;
}
}
/* End of recovery? */
if ((hostPtr != (RecovHostState *) NIL) &&
(stateBits & CLT_RECOV_IN_PROGRESS) != 0) {
/* End of 1st recovery try? */
if (hostPtr->numTries <= 1) {
Timer_GetCurrentTicks(&hostPtr->finished);
/* Final count of handles recovered is in hostPtr. */
hostPtr->numHandles = hostPtr->currentHandles;
hostPtr->numSuccessful = hostPtr->currentSuccessful;
} else {
if (List_IsEmpty(&hostPtr->timeStampList)) {
printf("Recov_ClearClientState: timeStampList is empty!\n");
hostPtr->numSuccessful = 0; /* signal the error */
} else {
stampPtr = (RecovStampList *)
List_Last((List_Links *) &hostPtr->timeStampList);
Timer_GetCurrentTicks(&stampPtr->finished);
stampPtr->numHandles = hostPtr->currentHandles;
stampPtr->numSuccessful = hostPtr->currentSuccessful;
}
}
}
UNLOCK_MONITOR;
return;
}
/*
*----------------------------------------------------------------------
*
* Recov_AddHandleCountToClientState --
*
* Increment count of handles reopened from this client.
*
* Results:
* None.
*
* Side effects:
* Data in per-host recovery info updated.
*
*----------------------------------------------------------------------
*/
ENTRY void
Recov_AddHandleCountToClientState(type, clientID, status)
int type; /* Type of handle being reopened. */
int clientID; /* Id of client requesting reopen. */
ReturnStatus status; /* Whether the reopen succeeded. */
{
register Hash_Entry *hashPtr;
register RecovHostState *hostPtr = (RecovHostState *) NIL;
LOCK_MONITOR;
hashPtr = Hash_LookOnly(recovHashTable, (Address)clientID);
if (hashPtr != (Hash_Entry *)NIL) {
hostPtr = (RecovHostState *)hashPtr->value;
if (hostPtr != (RecovHostState *)NIL) {
hostPtr->currentHandles++;
if (status == SUCCESS) {
hostPtr->currentSuccessful++;
}
}
}
UNLOCK_MONITOR;
return;
}
/*
*----------------------------------------------------------------------
*
* Recov_DumpClientRecovInfo --
*
* Dump out some of the recovery statistics in the per-host info.
*
* Results:
* Returns FAILURE if recovery still in progress. Returns SUCCESS
* otherwise.
*
* Side effects:
* Info copied into buffer. Size of needed buffer also copied out.
*
*----------------------------------------------------------------------
*/
ENTRY ReturnStatus
Recov_DumpClientRecovInfo(length, resultPtr, lengthNeededPtr)
int length; /* size of data buffer */
Address resultPtr; /* Array of info structs. */
int *lengthNeededPtr; /* to return space needed */
{
Hash_Entry *hashPtr;
RecovHostState *hostPtr;
Hash_Search hashSearch;
RecovPerHostInfo *infoPtr;
int numNeeded;
int numAvail;
LOCK_MONITOR;
/*
* If recovery still going on, return FAILURE.
* NOTE: This isn't a sure-fire test. I'm not sure there is one right now.
*/
if (fsutil_NumRecovering >= 1) {
UNLOCK_MONITOR;
return FAILURE;
}
if (resultPtr != (Address) NIL) {
bzero(resultPtr, length);
}
numNeeded = 0;
numAvail = length / sizeof (RecovPerHostInfo);
infoPtr = (RecovPerHostInfo *) resultPtr;
Hash_StartSearch(&hashSearch);
for (hashPtr = Hash_Next(recovHashTable, &hashSearch);
hashPtr != (Hash_Entry *) NIL;
hashPtr = Hash_Next(recovHashTable, &hashSearch)) {
hostPtr = (RecovHostState *)hashPtr->value;
/*
* We need one slot for each host, whether numTries is 0 or 1, plus
* additional slots for each numTries over 1.
*/
numNeeded++;
if (hostPtr->numTries > 1) {
numNeeded += (hostPtr->numTries - 1);
}
if (numNeeded > numAvail) {
continue;
}
/* Why didn't Brent use GetValue()??? */
if (hostPtr != (RecovHostState *) NIL) {
RecovStampList *stampPtr;
/* Copy info into buffer */
infoPtr->spriteID = hostPtr->spriteID;
infoPtr->numTries = hostPtr->numTries;
Timer_GetRealTimeFromTicks(hostPtr->start,
&(infoPtr->start), (int *) NIL, (Boolean *) NIL);
Timer_GetRealTimeFromTicks(hostPtr->finished,
&(infoPtr->finished), (int *)NIL, (Boolean *) NIL);
infoPtr->numHandles = hostPtr->numHandles;
infoPtr->numSuccessful = hostPtr->numSuccessful;
LIST_FORALL(&hostPtr->timeStampList, (List_Links *) stampPtr) {
infoPtr++;
Timer_GetRealTimeFromTicks(stampPtr->start,
&infoPtr->start, (int *) NIL, (Boolean *) NIL);
Timer_GetRealTimeFromTicks(stampPtr->finished,
&infoPtr->finished, (int *) NIL, (Boolean *) NIL);
infoPtr->numHandles = stampPtr->numHandles;
infoPtr->numSuccessful = stampPtr->numSuccessful;
}
}
infoPtr++;
}
*lengthNeededPtr = numNeeded * sizeof (RecovPerHostInfo);
UNLOCK_MONITOR;
return SUCCESS;
}
/*
*----------------------------------------------------------------------
*
* RecovRebootCallBacks --
*
* This calls the call-back procedures installed by other modules
* via Recov_RebootRegister. It is invoked asynchronously from
* Recov_HostAlive when that procedure detects a reboot.
*
* Results:
* None.
*
* Side effects:
* Invoke the call-backs.
*
*----------------------------------------------------------------------
*/
/*ARGSUSED*/
void
RecovRebootCallBacks(data, callInfoPtr)
ClientData data;
Proc_CallInfo *callInfoPtr;
{
List_Links notifyList;
register NotifyElement *notifyPtr;
register int spriteID = (int)data;
GetRebootList(¬ifyList, spriteID);
recov_Stats.reboots++;
while (!List_IsEmpty(¬ifyList)) {
notifyPtr = (NotifyElement *)List_First(¬ifyList);
(*notifyPtr->proc)(spriteID, notifyPtr->data);
List_Remove((List_Links *)notifyPtr);
free((Address)notifyPtr);
}
CallBacksDone(spriteID);
return;
}
/*
*----------------------------------------------------------------------
*
* GetRebootList --
*
* Copy out the list of reboot callbacks. The list is protected by
* a monitor, but we don't want to call any recovery procedures from
* inside that monitor so we make a copy.
*
* Results:
* None.
*
* Side effects:
* Copy the reboot list off the host state table and return it
* to our caller who should free up the copied elements.
*
*----------------------------------------------------------------------
*/
ENTRY static void
GetRebootList(notifyListHdr, spriteID)
List_Links *notifyListHdr;
int spriteID;
{
register Hash_Entry *hashPtr;
register RecovHostState *hostPtr;
register NotifyElement *notifyPtr;
register NotifyElement *newNotifyPtr;
LOCK_MONITOR;
hashPtr = Hash_LookOnly(recovHashTable, (Address)spriteID);
hostPtr = (RecovHostState *)hashPtr->value;
List_Init(notifyListHdr);
LIST_FORALL(&hostPtr->rebootList, (List_Links *)notifyPtr) {
newNotifyPtr = (NotifyElement *) malloc(sizeof (NotifyElement));
newNotifyPtr->proc = notifyPtr->proc;
newNotifyPtr->data = notifyPtr->data;
List_InitElement((List_Links *)newNotifyPtr);
List_Insert((List_Links *)newNotifyPtr, LIST_ATREAR(notifyListHdr));
}
UNLOCK_MONITOR;
return;
}
/*
*----------------------------------------------------------------------
*
* CallBacksDone --
*
* Clear the internal state bit that says callbacks are in progress.
* This checks to see if there was a communication failure during
* the reboot callbacks. If so, the WANT_RECOVERY bit is set
* to ensure that another set of reboot callbacks are made.
*
* Results:
* None.
*
* Side effects:
* Clears RECOV_REBOOT_CALLBACKS and RECOV_FAILURE. May set
* RECOV_WANT_RECOVERY if RECOV_FAILURE was set.
*
*----------------------------------------------------------------------
*/
ENTRY static void
CallBacksDone(spriteID)
int spriteID;
{
register Hash_Entry *hashPtr;
register RecovHostState *hostPtr;
LOCK_MONITOR;
hashPtr = Hash_LookOnly(recovHashTable, (Address)spriteID);
hostPtr = (RecovHostState *)hashPtr->value;
hostPtr->state &= ~RECOV_REBOOT_CALLBACKS;
if (hostPtr->state & (RECOV_FAILURE)) {
/*
* There has been a communication failure during the reboot callbacks.
*/
hostPtr->numFailures++;
hostPtr->state &= ~RECOV_FAILURE;
hostPtr->state |= RECOV_WANT_RECOVERY;
} else {
hostPtr->numFailures = 0;
}
UNLOCK_MONITOR;
return;
}
/*
*----------------------------------------------------------------------
*
* CrashCallBacks --
*
* Invoked asynchronously so that other modules
* can clean up behind the crashed host. When done the host
* is marked as having recovery complete. This unblocks server
* processes stalled in Recov_HostAlive.
*
* Results:
* None.
*
* Side effects:
* Invoke the crash call-backs.
* Clears the recovery in progress flag checked in Recov_HostAlive.
*
*----------------------------------------------------------------------
*/
static void
CrashCallBacks(data, callInfoPtr)
ClientData data;
Proc_CallInfo *callInfoPtr;
{
register NotifyElement *notifyPtr;
register int spriteID = (int)data;
recov_Stats.crashes++;
LIST_FORALL(&crashCallBackList, (List_Links *)notifyPtr) {
if (notifyPtr->proc != (void (*)())NIL) {
(*notifyPtr->proc)(spriteID, notifyPtr->data);
}
}
MarkRecoveryComplete(spriteID);
RECOV_TRACE(spriteID, RECOV_CRASH, RECOV_CUZ_DONE);
callInfoPtr->interval = 0; /* Don't call again */
return;
}
#ifdef dying_state
/*
*----------------------------------------------------------------------
*
* DelayedCrashCallBacks --
*
* Invoked asynchronously from Recov_HostDead. This is called after
* a grace period defined by recov_CrashDelay so that, for example,
* clients can be debugged without having the server close all
* their files. When a client reboots, hoever, the crash callbacks
* will be sure to be called so other modules can clean up.
*
* Results:
* None.
*
* Side effects:
* Invoke the crash call-backs.
* Clears the recovery in progress flag checked in Recov_HostAlive.
*
*----------------------------------------------------------------------
*/
static void
DelayedCrashCallBacks(data, callInfoPtr)
ClientData data;
Proc_CallInfo *callInfoPtr;
{
register NotifyElement *notifyPtr;
register int spriteID = (int)data;
int state;
state = Recov_GetHostState(spriteID);
if (state & RECOV_HOST_DYING) {
RecovHostPrint(RECOV_PRINT_CRASH, spriteID,
"crash call-backs being made\n");
recov_Stats.crashes++;
MarkHostDead(spriteID);
LIST_FORALL(&crashCallBackList, (List_Links *)notifyPtr) {
if (notifyPtr->proc != (void (*)())NIL) {
(*notifyPtr->proc)(spriteID, notifyPtr->data);
}
}
MarkRecoveryComplete(spriteID);
} else if ((state & RECOV_HOST_DEAD) == 0) {
recov_Stats.nonCrashes++;
}
callInfoPtr->interval = 0; /* Don't call again */
return;
}
#endif /* dying_state */
/*
*----------------------------------------------------------------------
*
* MarkRecoveryComplete --
*
* The recovery call-backs have completed, and this procedure's
* job is to mark that fact in the host hash table and to notify
* any processes that are blocked in Recov_HostAlive waiting for this.
*
* Results:
* None.
*
* Side effects:
* Sets the state, if any, in the host state table.
* Notifies the hostPtr->recovery condition
*
*----------------------------------------------------------------------
*/
ENTRY static void
MarkRecoveryComplete(spriteID)
int spriteID;
{
register Hash_Entry *hashPtr;
register RecovHostState *hostPtr;
LOCK_MONITOR;
hashPtr = Hash_LookOnly(recovHashTable, (Address)spriteID);
if (hashPtr != (Hash_Entry *)NIL) {
hostPtr = (RecovHostState *)hashPtr->value;
if (hostPtr != (RecovHostState *)NIL) {
hostPtr->state &= ~RECOV_CRASH_CALLBACKS;
Sync_Broadcast(&hostPtr->recovery);
}
}
UNLOCK_MONITOR;
return;
}
#ifdef dying_state
/*
*----------------------------------------------------------------------
*
* MarkHostDead --
*
* Monitored procedure to change a host's state from dying to dead.
* This is done after the grace period has expired and we are
* about to call the crash callbacks.
*
* Results:
* None.
*
* Side effects:
* Set the state to RECOV_HOST_DEAD
*
*----------------------------------------------------------------------
*/
ENTRY static void
MarkHostDead(spriteID)
int spriteID;
{
register Hash_Entry *hashPtr;
register RecovHostState *hostPtr;
LOCK_MONITOR;
hashPtr = Hash_LookOnly(recovHashTable, (Address)spriteID);
if (hashPtr != (Hash_Entry *)NIL) {
hostPtr = (RecovHostState *)hashPtr->value;
if (hostPtr != (RecovHostState *)NIL) {
hostPtr->state &= ~RECOV_HOST_DYING;
hostPtr->state |= RECOV_HOST_DEAD;
}
}
UNLOCK_MONITOR;
return;
}
#endif
/*
*----------------------------------------------------------------------
*
* Recov_GetHostState --
*
* This looks into the host table to see and provides a guess
* as to the host's current state. It uses a timestamp kept in
* the host state to see if there's been recent message traffic.
* If so, RECOV_HOST_ALIVE is returned. If not, RECOV_STATE_UNKNOWN
* is returned and the caller should ping to make sure. Finally,
* if it is known that the host is down already, then RECOV_HOST_DEAD
* is returned.
*
* Results:
* RECOV_STATE_UNKNOWN if the caller should ping to make sure.
* RECOV_HOST_ALIVE if the host is up (recent message traffic).
* RECOV_HOST_DEAD if the host is down (recent timeouts).
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
ENTRY int
Recov_GetHostState(spriteID)
int spriteID;
{
register Hash_Entry *hashPtr;
register RecovHostState *hostPtr;
register int state = RECOV_STATE_UNKNOWN;
Time time;
LOCK_MONITOR;
hashPtr = Hash_LookOnly(recovHashTable, (Address)spriteID);
if (hashPtr != (Hash_Entry *)NIL) {
hostPtr = (RecovHostState *)hashPtr->value;
if (hostPtr != (RecovHostState *)NIL) {
state = hostPtr->state &
(RECOV_HOST_ALIVE|RECOV_HOST_BOOTING|RECOV_HOST_DYING|RECOV_HOST_DEAD);
if (state & (RECOV_HOST_ALIVE|RECOV_HOST_BOOTING)) {
/*
* Check for recent message traffic before admitting
* that the other machine is up.
*/
Timer_GetTimeOfDay(&time, (int *)NIL, (Boolean *)NIL);
Time_Subtract(time, hostPtr->time, &time);
if (Time_GT(time, time_TenSeconds)) {
state = RECOV_STATE_UNKNOWN;
}
}
}
}
UNLOCK_MONITOR;
return(state);
}
/*
*----------------------------------------------------------------------
*
* RecovGetLastHostState --
*
* This looks into the host table to pass back the
* host's current state. It just uses whatever state the
* host has marked currently, and does no further interpretation.
*
* Results:
* hostPtr->state
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
ENTRY int
RecovGetLastHostState(spriteID)
int spriteID;
{
register Hash_Entry *hashPtr;
register RecovHostState *hostPtr;
register int state = RECOV_STATE_UNKNOWN;
LOCK_MONITOR;
hashPtr = Hash_LookOnly(recovHashTable, (Address)spriteID);
if (hashPtr != (Hash_Entry *)NIL) {
hostPtr = (RecovHostState *)hashPtr->value;
if (hostPtr != (RecovHostState *)NIL) {
state = hostPtr->state;
}
}
UNLOCK_MONITOR;
return(state);
}
/*
*----------------------------------------------------------------------
*
* RecovCheckHost --
*
* This decides if we should check up on a host. If there has
* been recent message traffic there is no need to ping now,
* but we should check again later. If there has been no
* message traffic our caller should ping. Finally, if
* there are no reboot callbacks associated with the host,
* then we are not interested anymore. Thus there are three
* values to return.
*
* Results:
* -1 if we are no longer interested in the host.
* 0 if the host is presumably up and we don't have to ping.
* 1 if our caller should ping.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
ENTRY int
RecovCheckHost(spriteID)
int spriteID;
{
register Hash_Entry *hashPtr;
register RecovHostState *hostPtr = (RecovHostState *)NIL;
register int check = -1; /* forget about the host */
register int state;
LOCK_MONITOR;
hashPtr = Hash_LookOnly(recovHashTable, (Address)spriteID);
if (hashPtr != (Hash_Entry *)NIL) {
hostPtr = (RecovHostState *)hashPtr->value;
if ((hostPtr != (RecovHostState *)NIL) &&
(!List_IsEmpty(&hostPtr->rebootList))) {
state = hostPtr->state &
(RECOV_HOST_ALIVE|RECOV_HOST_BOOTING|RECOV_HOST_DYING|RECOV_HOST_DEAD);
if (state & (RECOV_HOST_ALIVE|RECOV_HOST_BOOTING)) {
/*
* Check for recent message traffic before admitting
* that the other machine is up.
*/
Time time;
Timer_GetTimeOfDay(&time, (int *)NIL, (Boolean *)NIL);
Time_Subtract(time, hostPtr->time, &time);
if (Time_GT(time, time_TenSeconds)) {
check = 1; /* ping the host now */
} else {
check = 0; /* ping the host maybe next time */
}
} else if (state & (RECOV_HOST_DEAD|RECOV_HOST_DYING)) {
check = 1; /* ping the host now */
}
}
}
if (check < 0 && hostPtr != (RecovHostState *)NIL) {
hostPtr->state &= ~RECOV_PINGING_HOST;
}
UNLOCK_MONITOR;
return(check);
}
/*
*----------------------------------------------------------------------
*
* Recov_GetStats --
*
* Return the Recov_Stats to user-level, and perhaps more information
* about our internal opinion of other hosts.
*
* Results:
* None.
*
* Side effects:
* Copies data out to user-space.
*
*----------------------------------------------------------------------
*/
ReturnStatus
Recov_GetStats(size, userAddr)
int size;
Address userAddr;
{
ReturnStatus status;
int extraSpace = -1;
if (size <= 0) {
return(GEN_INVALID_ARG);
}
/*
* See if the caller wants more than just statistics.
*/
if (size > sizeof(Recov_Stats)) {
extraSpace = size - sizeof(Recov_Stats);
size = sizeof(Recov_Stats);
}
status = Vm_CopyOut(size, (Address)&recov_Stats, userAddr);
#ifdef notdef
if (extraSpace > sizeof(int)) {
/*
* Fill the user-space buffer with a count of hosts,
* and then information about each host.
*/
userAddr += sizeof(Recov_Stats);
status = Recov_DumpState(extraSpace, userAddr);
}
#endif notdef
return(status);
}
/*
*----------------------------------------------------------------------
*
* Recov_DumpState --
*
* Dump internal state to user-level.
*
* Results:
* None.
*
* Side effects:
* Copies data out to user-space.
*
*----------------------------------------------------------------------
*/
ReturnStatus
Recov_DumpState(size, userAddr)
int size;
Address userAddr;
{
ReturnStatus status = SUCCESS;
int numHosts, maxHosts;
int *countPtr;
int spriteID;
Recov_State recovState;
/*
* We return a count, plus count number of Recov_State structures.
*/
maxHosts = (size - sizeof(int)) / sizeof(Recov_State);
countPtr = (int *)userAddr;
if ((maxHosts == 0) && (size > sizeof(int))) {
status = Vm_CopyOut(sizeof(int), (Address)&maxHosts, (Address)countPtr);
return(status);
}
userAddr += sizeof(int);
/*
* Brute force. Run through til MAX_HOSTS and try to grab
* the state from the hash table.
*/
numHosts = 0;
for (spriteID = 1 ; spriteID < NET_NUM_SPRITE_HOSTS ; spriteID++) {
if (Recov_GetHostInfo(spriteID, &recovState)) {
status = Vm_CopyOut(sizeof(recovState), (Address)&recovState,
userAddr);
if (status != SUCCESS) {
return(status);
}
userAddr += sizeof(recovState);
numHosts++;
if (numHosts >= maxHosts) {
break;
}
}
}
return(status);
}
/*
*----------------------------------------------------------------------
*
* Recov_GetHostInfo --
*
* Get the internal state about a host.
*
* Results:
* Fills in a Recov_State structure and returns TRUE,
* otherwise, if we don't know about the host, returns FALSE
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
ENTRY Boolean
Recov_GetHostInfo(spriteID, recovStatePtr)
int spriteID;
Recov_State *recovStatePtr;
{
Hash_Entry *hashPtr;
RecovHostState *hostPtr;
Boolean found = FALSE;
LOCK_MONITOR;
if (spriteID <= 0 || spriteID == rpc_SpriteID) {
goto exit;
} else {
hashPtr = Hash_LookOnly(recovHashTable, (Address)spriteID);
if (hashPtr == (Hash_Entry *)NULL || hashPtr->value == (Address)NIL) {
goto exit;
} else {
hostPtr = (RecovHostState *)hashPtr->value;
}
recovStatePtr->spriteID = spriteID;
recovStatePtr->state = hostPtr->state;
recovStatePtr->clientState = hostPtr->clientState;
recovStatePtr->bootID = hostPtr->bootID;
recovStatePtr->time = hostPtr->time;
found = TRUE;
}
exit:
UNLOCK_MONITOR;
return(found);
}
/*
*----------------------------------------------------------------------
*
* Recov_PrintTraceRecord --
*
* Format and print the client data part of a recovery trace record.
*
* Results:
* None.
*
* Side effects:
* printf to the display.
*
*----------------------------------------------------------------------
*/
int
Recov_PrintTraceRecord(clientData, event, printHeaderFlag)
ClientData clientData; /* Client data in the trace record */
int event; /* Type, or event, from the trace record */
Boolean printHeaderFlag; /* If TRUE, a header line is printed */
{
RecovTraceRecord *recPtr = (RecovTraceRecord *)clientData;
char name[128];
if (printHeaderFlag) {
/*
* Print column headers and a newline.
*/
printf("%10s %10s %17s\n", "Host", "State", "Event ");
}
if (clientData != (ClientData)NIL) {
Net_SpriteIDToName(recPtr->spriteID, 128, name);
if (*name == '\0') {
printf("%10d ", recPtr->spriteID);
} else {
printf("%10s ", name);
}
printf("%-8s", GetState(recPtr->state));
printf("%3s", (recPtr->state & RECOV_CRASH_CALLBACKS) ?
" C " : " ");
printf("%3s", (recPtr->state & RECOV_PINGING_HOST) ?
" P " : " ");
printf("%3s", (recPtr->state & RECOV_REBOOT_CALLBACKS) ?
" R " : " ");
printf("%3s", (recPtr->state & RECOV_WANT_RECOVERY) ?
" W " : " ");
switch(event) {
case RECOV_CUZ_WAIT:
printf("waiting");
break;
case RECOV_CUZ_WAKEUP:
printf("wakeup");
break;
case RECOV_CUZ_INIT:
printf("init");
break;
case RECOV_CUZ_REBOOT:
printf("reboot");
break;
case RECOV_CUZ_CRASH:
printf("crash");
break;
case RECOV_CUZ_CRASH_UNDETECTED:
printf("crash undetected");
break;
case RECOV_CUZ_DONE:
printf("done");
break;
case RECOV_CUZ_PING_ASK:
printf("ping (ask)");
break;
case RECOV_CUZ_PING_CHK:
printf("ping (check)");
break;
case RECOV_TRACE_FS_STALE:
printf("stale FS handle");
break;
default:
printf("(%x)", event);
break;
}
/* Our caller prints a newline */
}
return 0;
}
/*
*----------------------------------------------------------------------
*
* Recov_PrintTrace --
*
* Dump out the recovery trace. Called via a console L1 keystroke.
*
* Results:
* None.
*
* Side effects:
* Prints to the console.
*
*----------------------------------------------------------------------
*/
void
Recov_PrintTrace(clientData)
ClientData clientData;
{
int numRecs = (int)clientData;
if (numRecs <= 0 || numRecs > recovTraceLength) {
numRecs = recovTraceLength;
}
printf("RECOVERY TRACE\n");
(void)Trace_Print(recovTraceHdrPtr, numRecs, Recov_PrintTraceRecord);
Recov_PrintState();
RecovPrintPingList();
return;
}
/*
*----------------------------------------------------------------------
*
* Recov_PrintState --
*
* Dump out the recovery state. Called via a console L1 keystroke.
*
* Results:
* None.
*
* Side effects:
* Prints to the console.
*
*----------------------------------------------------------------------
*/
void
Recov_PrintState()
{
Hash_Search hashSearch;
register Hash_Entry *hashEntryPtr;
register RecovHostState *hostPtr;
char hostName[128];
Time_Parts timeParts;
Time bootTime;
int localOffset; /* minute offset for our tz */
Time currentTime;
printf("RECOVERY STATE\n");
Hash_StartSearch(&hashSearch);
for (hashEntryPtr = Hash_Next(recovHashTable, &hashSearch);
hashEntryPtr != (Hash_Entry *)NIL;
hashEntryPtr = Hash_Next(recovHashTable, &hashSearch)) {
hostPtr = (RecovHostState *)hashEntryPtr->value;
if (hostPtr != (RecovHostState *)NIL) {
Net_SpriteIDToName(hostPtr->spriteID, 128, hostName);
printf("%-14s %-8s", hostName, GetState(hostPtr->state));
printf(" bootID 0x%8x", hostPtr->bootID);
/*
* Print out boot time in our timezone.
*/
Timer_GetTimeOfDay(¤tTime, &localOffset, (Boolean *) NIL);
bootTime.seconds = hostPtr->bootID;
bootTime.microseconds = 0;
bootTime.seconds += (localOffset * 60);
Time_ToParts(bootTime.seconds, FALSE, &timeParts);
timeParts.month++; /* So Jan is 1, not 0 */
printf(" %d/%d/%d %d:%02d:%02d ", timeParts.month,
timeParts.dayOfMonth,
timeParts.year, timeParts.hours, timeParts.minutes,
timeParts.seconds);
/*
* Print seconds ago we last heard from host.
*/
printf(" %d ", currentTime.seconds - hostPtr->time.seconds);
PrintExtraState(hostPtr);
printf("\n");
}
}
return;
}
/*
*----------------------------------------------------------------------
*
* GetState --
*
* Return a printable string for the host's state.
*
* Results:
* A pointer to a string.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
static char *
GetState(state)
int state;
{
switch(state & (RECOV_HOST_ALIVE|RECOV_HOST_DYING|RECOV_HOST_DEAD|
RECOV_HOST_BOOTING)) {
default:
case RECOV_STATE_UNKNOWN:
return("Unknown");
case RECOV_HOST_ALIVE:
return("Alive");
case RECOV_HOST_BOOTING:
return("Booting");
case RECOV_HOST_DYING:
return("Dying");
case RECOV_HOST_DEAD:
return("Dead");
}
}
/*
*----------------------------------------------------------------------
*
* RecovExtraState --
*
* Prints out strings for various auxilliary state bits.
*
* Results:
* None.
*
* Side effects:
* Prints out stuff.
*
*----------------------------------------------------------------------
*/
static void
PrintExtraState(hostPtr)
RecovHostState *hostPtr;
{
if (hostPtr->state & RECOV_CRASH_CALLBACKS) {
printf("Crash callbacks ");
}
if (hostPtr->state & RECOV_WANT_RECOVERY) {
printf("Want recovery ");
}
if (hostPtr->state & RECOV_REBOOT_CALLBACKS) {
printf("Reboot callbacks ");
}
if (hostPtr->state & RECOV_FAILURE) {
printf("Failure ");
}
if (hostPtr->clientState & CLT_RECOV_IN_PROGRESS) {
printf("Clt-inprogress ");
}
if (hostPtr->clientState & SRV_RECOV_IN_PROGRESS) {
printf("Srv-inprogress ");
}
}
void
Recov_ChangePrintLevel(newLevel)
int newLevel;
{
recov_PrintLevel = newLevel;
return;
}